Decision Trees

Udacity: Intro to Machine Learning

Chapter 4

Note: since the required course dependencies (class_vis, prep_terrain_data) are not available here, the code below is shown as a snippet rather than executed in cell blocks.

Finding Accuracy:

import sys
from class_vis import prettyPicture
from prep_terrain_data import makeTerrainData
from sklearn import tree
from sklearn.metrics import accuracy_score

import numpy as np
import pylab as pl

features_train, labels_train, features_test, labels_test = makeTerrainData()



#################################################################################


########################## DECISION TREE #################################

def classify(features_train, labels_train):

    ### your code goes here--should return a trained decision tree classifier
    X = features_train
    Y = labels_train
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X,Y)
    return clf

#### your code goes here
clf = classify(features_train, labels_train)


# ask the classifier to predict on the test set
labels_pred = clf.predict(features_test)

### be sure to compute the accuracy on the test set
acc = accuracy_score(labels_test, labels_pred)



def submitAccuracies():
  return {"acc":round(acc,3)}

Output: {"message": "{'acc': 0.908}"}

Minimum Samples (2 and 50):

import sys
from class_vis import prettyPicture
from prep_terrain_data import makeTerrainData
from sklearn import tree
import matplotlib.pyplot as plt
import numpy as np
import pylab as pl
from sklearn.metrics import accuracy_score

features_train, labels_train, features_test, labels_test = makeTerrainData()



########################## DECISION TREE #################################


### your code goes here--now create 2 decision tree classifiers,
### one with min_samples_split=2 and one with min_samples_split=50
### compute the accuracies on the testing data and store
### the accuracy numbers to acc_min_samples_split_2 and
### acc_min_samples_split_50, respectively
def classify(features_train, labels_train, min_samples=2):

    ### your code goes here--should return a trained decision tree classifier
    X = features_train
    Y = labels_train
    clf = tree.DecisionTreeClassifier(min_samples_split=min_samples)
    clf = clf.fit(X,Y)
    return clf

clf_2 = classify(features_train, labels_train, 2)
clf_50 = classify(features_train, labels_train, 50)

labels_pred_2 = clf_2.predict(features_test)
labels_pred_50 = clf_50.predict(features_test)

acc_min_samples_split_2 = accuracy_score(labels_test, labels_pred_2)
acc_min_samples_split_50 = accuracy_score(labels_test, labels_pred_50)


def submitAccuracies():
  return {"acc_min_samples_split_2":round(acc_min_samples_split_2,3),
          "acc_min_samples_split_50":round(acc_min_samples_split_50,3)}

Output: {"message": "{'acc_min_samples_split_50': 0.912, 'acc_min_samples_split_2': 0.908}"}

Entropy

In [1]:
from IPython.display import YouTubeVideo
# Entropy case https://youtu.be/Bd15qhUrKCI
YouTubeVideo('Bd15qhUrKCI')
Out[1]:
In [2]:
# Entropy case part 2 https://youtu.be/L6J6BRFgDiI
YouTubeVideo('L6J6BRFgDiI')
Out[2]:

Entropy calculation for the case below (image: 2018-06-17_16h19_43.png). In general, for class fractions p_i, the entropy is H = -sum_i p_i * log2(p_i):

In [3]:
from math import log

parent_entropy = -(0.5*log(0.5,2) + 0.5*log(0.5,2))  # log base 2, so 2nd argument
parent_entropy  # maximally impure
Out[3]:
1.0

Information gain based on grade (image: 2018-06-17_16h29_23.png)

In [4]:
child_entropy_1 = -(2/3)*log(2/3,2)-(1/3)*log(1/3,2)
child_entropy_1
Out[4]:
0.9182958340544896
In [5]:
child_entropy_2 = 0  # the node contains only 'f', so it is pure

information_gain = parent_entropy - ( (3/4)*child_entropy_1 + (1/4)*child_entropy_2 )
information_gain
Out[5]:
0.31127812445913283
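The same arithmetic can be wrapped in two small helpers so the remaining splits are quick to check; a minimal sketch that mirrors the hand calculation above:

from math import log

def entropy(labels):
    """Shannon entropy (in bits) of a list of class labels."""
    n = len(labels)
    probs = [labels.count(c) / n for c in set(labels)]
    return -sum(p * log(p, 2) for p in probs if p > 0)

def information_gain(parent, children):
    """Parent entropy minus the size-weighted entropy of the children."""
    n = len(parent)
    weighted = sum(len(c) / n * entropy(c) for c in children)
    return entropy(parent) - weighted

# grade split from above: parent 'ssff' -> children 'ssf' and 'f'
print(information_gain(list("ssff"), [list("ssf"), list("f")]))  # ~0.311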

Information gain based on bumpiness (image: 2018-06-17_17h11_58.png)

In [6]:
from graphviz import Digraph
g = Digraph()
g.node('A','ssff',xlabel='P')
g.node('B','sf',xlabel='C1')
g.node('C','sf',xlabel='C2')
g.edge('A','B',label='bumpy')
g.edge('A','C',label='smooth')
g
Out[6]:
[rendered Graphviz diagram: P('ssff') --bumpy--> C1('sf'), --smooth--> C2('sf')]
In [7]:
# Parent P : ssff
entropy_P = -(0.5*log(0.5,2) + 0.5*log(0.5,2))

# Child C1 : sf
entropy_C1 = -(0.5*log(0.5,2) + 0.5*log(0.5,2))

# Child C2 : sf
entropy_C2 = -(0.5*log(0.5,2) + 0.5*log(0.5,2))
In [8]:
information_gain = entropy_P - ( (2/4)*entropy_C1 + (2/4)*entropy_C2 )
information_gain
Out[8]:
0.0
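Using the helper defined above confirms that the bumpiness split carries no information:

# bumpiness split: parent 'ssff' -> children 'sf' and 'sf'
print(information_gain(list("ssff"), [list("sf"), list("sf")]))  # 0.0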

Information gain based on speed limit (image: 2018-06-17_18h32_54.png)

In [9]:
from graphviz import Digraph
g = Digraph()
g.node('A','ssff',xlabel='P')
g.node('B','ss',xlabel='C1')
g.node('C','ff',xlabel='C2')
g.edge('A','B',label='Speed limit yes')
g.edge('A','C',label='Speed limit no')
g
Out[9]:
[rendered Graphviz diagram: P('ssff') --Speed limit yes--> C1('ss'), --Speed limit no--> C2('ff')]
In [10]:
# Parent P : ssff
entropy_P = -(0.5*log(0.5,2) + 0.5*log(0.5,2))

# Child C1 : ss
entropy_C1 = -(1*log(1,2))

# Child C2 : ff
entropy_C2 = -(1*log(1,2))
In [11]:
information_gain = entropy_P - ( (2/4)*entropy_C1 + (2/4)*entropy_C2 )
information_gain
Out[11]:
1.0
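The speed-limit split gives the maximum possible gain of 1 bit, so a tree built on information gain would pick it first. In scikit-learn, information gain is selected with criterion='entropy' (the default is 'gini'); a minimal sketch, assuming the terrain data from the first snippet is in scope:

from sklearn import tree

# verify the speed-limit split with the helper defined earlier
print(information_gain(list("ssff"), [list("ss"), list("ff")]))  # 1.0

# ask sklearn to choose splits by information gain instead of the default Gini impurity
clf_entropy = tree.DecisionTreeClassifier(criterion='entropy', min_samples_split=2)
clf_entropy = clf_entropy.fit(features_train, labels_train)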

DT Mini Project

Email classification.
Starter skeleton code is here

In [12]:
#!/usr/bin/python

""" 
    This is the code to accompany the Lesson 2 (SVM) mini-project.

    Use a SVM to identify emails from the Enron corpus by their authors:    
    Sara has label 0
    Chris has label 1
"""

import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from sklearn import tree
from sklearn.metrics import accuracy_score


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()  # default feature-selection percentile is 10




#########################################################
### your code goes here ###
def classify(features_train, labels_train, min_samples=2):

    ### your code goes here--should return a trained decision tree classifier
    X = features_train
    Y = labels_train
    clf = tree.DecisionTreeClassifier(min_samples_split=min_samples)

    # train the classifier
    t0 = time()
    clf = clf.fit(X,Y)
    print("Decision Tree Training time: " + str(round(time()-t0,3)) + " s")    
    return clf

clf_40 = classify(features_train, labels_train, 40)

# predict
t0 = time()
labels_pred_40 = clf_40.predict(features_test)
print("Decision Tree Prediction time: " + str(round(time()-t0,3)) + " s")

# accuracy
acc_min_samples_split_40 = accuracy_score(labels_test, labels_pred_40)

print("Decision Tree Predicted labels: " + str(len(labels_pred_40)))
print("Decision Tree Accuracy: " + str(accuracy_score(labels_test, labels_pred_40)))

#########################################################
C:\Users\parthi2929\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
no. of Chris training emails: 7936
no. of Sara training emails: 7884
Decision Tree Training time: 66.348 s
Decision Tree Prediction time: 0.04 s
Decision Tree Predicted labels: 1758
Decision Tree Accuracy: 0.9789533560864618

Speeding it up

Number of features in the data:

In [13]:
features_train, features_test, labels_train, labels_test = preprocess()

len(features_train[0])
no. of Chris training emails: 7936
no. of Sara training emails: 7884
Out[13]:
3785

As instructed by the Udacity mini-project: _"go into tools/email_preprocess.py, and find the line of code that looks like this: selector = SelectPercentile(f_classif, percentile=10). Change percentile from 10 to 1."_

IPython would have to reload the module for that edit to take effect, so the cell above would otherwise keep producing the same result; instead, the preprocessor was modified to accept the percentile as an argument.

In [14]:
from email_preprocess import preprocess
features_train, features_test, labels_train, labels_test = preprocess(percentile=1)  # now percentile = 1

len(features_train[0])
no. of Chris training emails: 7936
no. of Sara training emails: 7884
Out[14]:
379
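For reference, the reduction from 3785 to 379 features comes from scikit-learn's SelectPercentile step inside email_preprocess.py; a minimal sketch of that step (the variable names for the vectorized email data are illustrative):

from sklearn.feature_selection import SelectPercentile, f_classif

# keep only the top 1% of TF-IDF features ranked by the ANOVA F-test
# (features_train_tfidf / features_test_tfidf are illustrative names for the
#  vectorized email data inside email_preprocess.py)
selector = SelectPercentile(f_classif, percentile=1)
selector.fit(features_train_tfidf, labels_train)
features_train_selected = selector.transform(features_train_tfidf).toarray()
features_test_selected = selector.transform(features_test_tfidf).toarray()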

Calculating the accuracy again with the selector percentile set to 1:

In [15]:
#!/usr/bin/python

""" 
    This is the code to accompany the Lesson 2 (SVM) mini-project.

    Use a SVM to identify emails from the Enron corpus by their authors:    
    Sara has label 0
    Chris has label 1
"""

import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from sklearn import tree
from sklearn.metrics import accuracy_score


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess(percentile=1)




#########################################################
### your code goes here ###
def classify(features_train, labels_train, min_samples=2):

    ### your code goes here--should return a trained decision tree classifier
    X = features_train
    Y = labels_train
    clf = tree.DecisionTreeClassifier(min_samples_split=min_samples)

    # train the classifier
    t0 = time()
    clf = clf.fit(X,Y)
    print("Decision Tree Training time: " + str(round(time()-t0,3)) + " s")    
    return clf

clf_40 = classify(features_train, labels_train, 40)

# predict
t0 = time()
labels_pred_40 = clf_40.predict(features_test)
print("Decision Tree Prediction time: " + str(round(time()-t0,3)) + " s")

# accuracy
acc_min_samples_split_40 = accuracy_score(labels_test, labels_pred_40)

print("Decision Tree Predicted labels: " + str(len(labels_pred_40)))
print("Decision Tree Accuracy: " + str(accuracy_score(labels_test, labels_pred_40)))

#########################################################
no. of Chris training emails: 7936
no. of Sara training emails: 7884
Decision Tree Training time: 4.572 s
Decision Tree Prediction time: 0.016 s
Decision Tree Predicted labels: 1758
Decision Tree Accuracy: 0.9670079635949943

Conclusion: reducing the number of features (percentile 10 → 1, i.e. 3785 → 379 features) lowers the accuracy slightly (0.979 → 0.967) but cuts the training time dramatically (66.3 s → 4.6 s).
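To map this trade-off more systematically, the same train/predict/score steps could be looped over several percentiles; a minimal sketch, assuming the preprocess(percentile=...) modification made above:

from time import time
from email_preprocess import preprocess
from sklearn import tree
from sklearn.metrics import accuracy_score

# illustrative sweep: feature-selection percentile vs. accuracy and training time
for pct in [1, 5, 10]:
    features_train, features_test, labels_train, labels_test = preprocess(percentile=pct)
    clf = tree.DecisionTreeClassifier(min_samples_split=40)
    t0 = time()
    clf.fit(features_train, labels_train)
    train_time = round(time() - t0, 3)
    acc = accuracy_score(labels_test, clf.predict(features_test))
    print("percentile=%d  features=%d  train=%ss  acc=%.3f"
          % (pct, len(features_train[0]), train_time, acc))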